# Fix the RNG seed so that all sampling below is reproducible.
set.seed(12345)
# Presumably loads the commit data set and the cachefile() helper used
# throughout this document — TODO confirm against ingest_data.R.
source("ingest_data.R")
# Base name passed to cachefile() for the cached brms model fits below.
MODEL_CACHE <- "removed-M_added_removed_lines"
Models are stored in the following directory, which must exist prior to knitting this document:
# Print the resolved model-cache directory. mustWork = TRUE makes
# normalizePath() fail loudly if the directory does not exist (it must be
# created before knitting). Note: TRUE is spelled out — `T` is a reassignable
# variable, not a reserved word.
cat(normalizePath(file.path(getwd(), dirname(cachefile(MODEL_CACHE))), mustWork = TRUE))
## /home/app/.cache
The used cache directory can be controlled via the cache parameter to Rmd - it can be useful to experiment with this parameter if you Knit the document manually in RStudio.
As we are modeling removals of duplicates, it seems logical that the number of existing duplicates in a file would affect the probability of duplicates being removed. After all, if the file contains no duplicates, it will be impossible to remove any, no matter how much you change the file. For the team and team-in-repository components, we will use added and removed lines as predictors.
This leaves us with the following model:
# Model data: rename the response (REMOVED -> y) and the grouping columns,
# and keep the predictor columns A, C, D and R under their own names.
d <- data |>
  select(y = REMOVED, A, C, D, R,
         team = committerteam,
         repo = repo)
# Zero-inflated negative binomial: existing duplicates (D), added (A) and
# removed (R) lines as population-level predictors, with varying intercepts
# and varying A/R slopes per team and per team-in-repository — for both the
# count part (y) and the zero-inflation part (zi).
formula <- bf(y ~ 1 + D + A + R + (1 + A + R | team) + (1 + A + R | team:repo),
zi ~ 1 + D + A + R + (1 + A + R | team) + (1 + A + R | team:repo))
# List every parameter of this specification that needs a prior.
get_prior(data=d,
family=zero_inflated_negbinomial,
formula=formula)
## prior class coef group resp dpar nlpar lb ub
## (flat) b
## (flat) b A
## (flat) b D
## (flat) b R
## lkj(1) cor
## lkj(1) cor team
## lkj(1) cor team:repo
## student_t(3, -2.3, 2.5) Intercept
## student_t(3, 0, 2.5) sd 0
## student_t(3, 0, 2.5) sd team 0
## student_t(3, 0, 2.5) sd A team 0
## student_t(3, 0, 2.5) sd Intercept team 0
## student_t(3, 0, 2.5) sd R team 0
## student_t(3, 0, 2.5) sd team:repo 0
## student_t(3, 0, 2.5) sd A team:repo 0
## student_t(3, 0, 2.5) sd Intercept team:repo 0
## student_t(3, 0, 2.5) sd R team:repo 0
## gamma(0.01, 0.01) shape 0
## (flat) b zi
## (flat) b A zi
## (flat) b D zi
## (flat) b R zi
## logistic(0, 1) Intercept zi
## student_t(3, 0, 2.5) sd zi 0
## student_t(3, 0, 2.5) sd team zi 0
## student_t(3, 0, 2.5) sd A team zi 0
## student_t(3, 0, 2.5) sd Intercept team zi 0
## student_t(3, 0, 2.5) sd R team zi 0
## student_t(3, 0, 2.5) sd team:repo zi 0
## student_t(3, 0, 2.5) sd A team:repo zi 0
## student_t(3, 0, 2.5) sd Intercept team:repo zi 0
## student_t(3, 0, 2.5) sd R team:repo zi 0
## source
## default
## (vectorized)
## (vectorized)
## (vectorized)
## default
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
# Weakly regularizing priors, applied to both the count and the zero-inflation
# (zi) part of the model:
#  - normal(0, 0.5) on intercepts, normal(0, 0.25) on slopes;
#  - weibull(2, 0.25) on group-level standard deviations (keeps them small
#    while placing little mass exactly at zero);
#  - lkj(2) mildly favouring low group-level correlations;
#  - gamma(0.5, 0.1) on the negative binomial shape parameter.
priors <- c(prior(normal(0, 0.5), class = Intercept),
prior(normal(0, 0.25), class = b),
prior(weibull(2, 0.25), class = sd),
prior(lkj(2), class = cor),
prior(normal(0, 0.5), class = Intercept, dpar=zi),
prior(normal(0, 0.25), class = b, dpar=zi),
prior(weibull(2, 0.25), class = sd, dpar=zi),
prior(gamma(0.5, 0.1), class = shape))
# Check that each user-specified prior matches a parameter of the model,
# then print the validated prior table.
v <- validate_prior(
  prior = priors,
  formula = formula,
  data = d,
  family = zero_inflated_negbinomial
)
v
## prior class coef group resp dpar nlpar lb ub
## normal(0, 0.25) b
## normal(0, 0.25) b A
## normal(0, 0.25) b D
## normal(0, 0.25) b R
## normal(0, 0.25) b zi
## normal(0, 0.25) b A zi
## normal(0, 0.25) b D zi
## normal(0, 0.25) b R zi
## normal(0, 0.5) Intercept
## normal(0, 0.5) Intercept zi
## lkj_corr_cholesky(2) L
## lkj_corr_cholesky(2) L team
## lkj_corr_cholesky(2) L team:repo
## weibull(2, 0.25) sd 0
## weibull(2, 0.25) sd zi 0
## weibull(2, 0.25) sd team 0
## weibull(2, 0.25) sd A team 0
## weibull(2, 0.25) sd Intercept team 0
## weibull(2, 0.25) sd R team 0
## weibull(2, 0.25) sd team zi 0
## weibull(2, 0.25) sd A team zi 0
## weibull(2, 0.25) sd Intercept team zi 0
## weibull(2, 0.25) sd R team zi 0
## weibull(2, 0.25) sd team:repo 0
## weibull(2, 0.25) sd A team:repo 0
## weibull(2, 0.25) sd Intercept team:repo 0
## weibull(2, 0.25) sd R team:repo 0
## weibull(2, 0.25) sd team:repo zi 0
## weibull(2, 0.25) sd A team:repo zi 0
## weibull(2, 0.25) sd Intercept team:repo zi 0
## weibull(2, 0.25) sd R team:repo zi 0
## gamma(0.5, 0.1) shape 0
## source
## user
## (vectorized)
## (vectorized)
## (vectorized)
## user
## (vectorized)
## (vectorized)
## (vectorized)
## user
## user
## user
## (vectorized)
## (vectorized)
## user
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
# Prior predictive model: identical specification to the final fit below, but
# sample_prior = "only" ignores the likelihood so all draws come from the
# priors alone. The fit is cached to disk and only refit when the model
# specification changes (file_refit = "on_change").
M_ppc <- brm(data = d,
family = zero_inflated_negbinomial,
formula = formula,
prior = priors,
file = cachefile(paste0("ppc-",MODEL_CACHE)),
warmup = 1000,
iter = ITERATIONS,
chains = CHAINS,
cores = CORES,
sample_prior = "only",
backend="cmdstanr",
file_refit = "on_change",
threads = threading(THREADS),
save_pars = SAVE_PARS,
adapt_delta = ADAPT_DELTA)
m <- M_ppc
# Draw replicated data sets from the prior predictive distribution.
yrep <- posterior_predict(m)
# Compare the prior-predicted proportion of zeros with the observed one.
prop_zero <- function(y) mean(y == 0)
ppc_stat(y = d$y, yrep, stat = prop_zero) +
  ggtitle("Prior predicted proportion of zeros")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The proportion of zeros seems plausible.
# Prior-predicted maxima versus the observed maximum; assign and print.
sim_max <- ppc_stat(y = d$y, yrep, stat = "max") +
  ggtitle("Prior predicted max values")
sim_max
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Zoom in on the bulk of the distribution (values beyond 1000 are cut off).
sim_max +
  scale_x_continuous(limits = c(0, 1000)) +
  ggtitle("Prior predicted max values up to 1000")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The maximum predictions are slightly wider than those of the intercept model, but the observed count falls within our prior predictions.
# 99th percentile: tail behaviour without the sensitivity of the raw maximum.
ppc_stat(y = d$y, yrep, stat = "q99") +
  ggtitle("Prior predicted Q99 vs. observed value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Relative to the intercept model, our priors place a little more probability on seeing duplicates being removed. The range is similar, though, it is just the distribution that changes.
# Joint view of two tail quantiles; assign and print.
p <- ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) +
  theme(legend.position = "bottom") +
  ggtitle("Prior predicted Q95 vs Q99")
p
Similar-looking 95th-vs-99th-percentile plot as the intercept model.
# Prior-predicted standard deviation against the observed one; assign and print.
p <- ppc_stat(y = d$y, yrep, stat = "sd") +
  ggtitle("Prior predicted stddev vs. observed value")
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Relative to the intercept model, this model has a slightly wider standard deviation range.
# Restrict the x axis to inspect the bulk of the distribution.
p + scale_x_continuous(limits = c(0, 30))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Per-team prior predictive check of the 99th percentile.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) +
  theme(legend.position = "bottom") +
  ggtitle("Prior predictive Q99 observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Wider prior predictions for most groups with these priors (Pink team being the exception).
# Per-repository prior predictive check of the 99th percentile.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) +
  theme(legend.position = "bottom") +
  ggtitle("Prior predictive Q99 observation per repository")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
When grouping per repository, there is much larger variation. Note that our model does not have any repository-only predictor, only “team-in-repository” (the reason being that a repository is not acting on its own—all changes are performed by a team).
# Fit the model to the data: same formula and priors as the prior predictive
# check above, now conditioning on the observations. Cached under MODEL_CACHE
# and only refit when the specification changes (file_refit = "on_change").
M_added_removed_lines <- brm(data = d,
family = zero_inflated_negbinomial,
file = cachefile(MODEL_CACHE),
formula = formula,
prior = priors,
warmup = 1000,
iter = ITERATIONS,
chains = CHAINS,
cores = CORES,
backend="cmdstanr",
file_refit = "on_change",
threads = threading(THREADS),
save_pars = SAVE_PARS,
adapt_delta = ADAPT_DELTA)
# Attach the PSIS-LOO criterion to the fitted model object.
M_added_removed_lines <- add_criterion(M_added_removed_lines, "loo")
m <- M_added_removed_lines
# Plot trace plots in batches of 12 parameters so each page stays readable.
p <- mcmc_trace(m)
pars <- levels(p[["data"]][["parameter"]])
starts <- seq(1, to = length(pars), by = 12)
lapply(starts, function(start) {
  # Clamp the batch end so we never index past the last parameter. The
  # previous version indexed out of range and used na.omit() to drop the
  # resulting NAs, which left an na.action attribute on the vector.
  end <- min(start + 11, length(pars))
  mcmc_trace(m, pars = pars[start:end])
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
Now we run into problems with model convergence—we see that certain chains did not converge properly. For instance, the b_D and b_zi_D predictors did not converge at all, and the intercepts are also fishy, as is the shape.
# Convergence diagnostics: potential scale reduction factor (Rhat) per
# parameter and as a histogram. Values well above 1 indicate non-convergence.
mcmc_plot(m, type="rhat")
mcmc_plot(m, type="rhat_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Effective sample size ratios (n_eff / N) per parameter and as a histogram.
mcmc_plot(m, type="neff")
mcmc_plot(m, type="neff_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We see also that some \(\hat{R}\) values are much higher than 1.
This means that the model did not converge properly, and we should not trust the posterior predictions.
# Leave-one-out cross-validation. NOTE(review): the object deliberately
# shadows the loo() function; it is reused below in plot(loo).
loo <- loo(m)
loo
##
## Computed from 12000 by 31007 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -5636.0 137.2
## p_loo 337.2 26.8
## looic 11272.0 274.4
## ------
## MCSE of elpd_loo is NA.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.0, 1.6]).
##
## Pareto k diagnostic values:
## Count Pct. Min. ESS
## (-Inf, 0.7] (good) 30998 100.0% 0
## (0.7, 1] (bad) 4 0.0% <NA>
## (1, Inf) (very bad) 5 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
# Plot the per-observation Pareto k diagnostics from the LOO computation.
plot(loo)